library(tidyverse) 
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(rainbow)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: pcaPP

Importing data

x400_nm-x700_nm: wavelength of light eqsref: meat sample retail: pre or post retail when sample is tested amc: meat colour categorisation (1:5) date: date of sample being tested Eliminate missing values.

x = read_csv("data/AtoPandTcmc.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   eqsref = col_character(),
##   works.body.no. = col_character(),
##   hunter.log = col_character(),
##   product = col_character(),
##   setup = col_integer(),
##   timestamp = col_datetime(format = ""),
##   stdtype = col_character(),
##   viewtype = col_character(),
##   avg = col_integer(),
##   illobs = col_character(),
##   scale = col_character(),
##   index = col_character(),
##   days.from.kill = col_integer(),
##   file = col_character(),
##   meter = col_character(),
##   cut = col_character(),
##   sequence = col_character(),
##   pack = col_character(),
##   day = col_integer(),
##   retail = col_character()
##   # ... with 14 more columns
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 48 parsing failures.
## row # A tibble: 5 x 5 col     row            col               expected actual expected   <int>          <chr>                  <chr>  <chr> actual 1  2629 days.from.kill no trailing characters     .5 file 2  2630 days.from.kill no trailing characters     .5 row 3  2631 days.from.kill no trailing characters     .5 col 4  2632 days.from.kill no trailing characters     .5 expected 5  2633 days.from.kill no trailing characters     .5 actual # ... with 1 more variables: file <chr>
## ... ................. ... .................................................... ........ .................................................... ...... .................................................... .... .................................................... ... .................................................... ... .................................................... ........ .................................................... ...... .......................................
## See problems(...) for more details.
x = janitor::clean_names(x)
x = x %>% mutate(date = as_date(timestamp))
fdata = x %>% dplyr::select(x400_nm:x700_nm, eqsref, retail, amc, date) %>% 
  na.omit() 
fdata
## # A tibble: 1,521 x 35
##    x400_nm x410_nm x420_nm x430_nm x440_nm x450_nm x460_nm x470_nm x480_nm
##      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1  0.8015  0.2989  0.2657  0.6006  1.7830  3.3685  5.1385  6.4553  8.0122
##  2  1.0129  0.3810  0.3398  1.0736  2.5370  4.4822  6.6183  8.7175 10.4712
##  3  0.6970  0.5675  0.4944  0.8737  1.6431  2.6197  3.6436  4.6615  5.6152
##  4  1.6878  1.2644  1.1954  1.5006  2.0944  2.9011  3.8514  4.8571  5.7873
##  5  3.0329  2.5885  2.7278  3.2587  4.2532  5.4271  6.5910  7.5594  8.3951
##  6  2.8820  2.4600  2.5460  2.8640  3.6640  4.7570  5.9070  7.0680  8.2290
##  7  0.5773  0.4198  0.3530  0.7652  1.5848  2.6497  3.7947  4.8870  5.9045
##  8  0.6886  0.3881  0.3365  0.6307  1.2258  2.0512  3.0328  4.0997  5.0773
##  9  0.3129  0.0836  0.1036  0.5900  1.5801  2.8377  4.1189  5.2856  6.2268
## 10  1.5288  0.9624  0.9451  1.3350  2.2800  3.4619  4.7747  6.1071  7.3239
## # ... with 1,511 more rows, and 26 more variables: x490_nm <dbl>,
## #   x500_nm <dbl>, x510_nm <dbl>, x520_nm <dbl>, x530_nm <dbl>,
## #   x540_nm <dbl>, x550_nm <dbl>, x560_nm <dbl>, x570_nm <dbl>,
## #   x580_nm <dbl>, x590_nm <dbl>, x600_nm <dbl>, x610_nm <dbl>,
## #   x620_nm <dbl>, x630_nm <dbl>, x640_nm <dbl>, x650_nm <dbl>,
## #   x660_nm <dbl>, x670_nm <dbl>, x680_nm <dbl>, x690_nm <dbl>,
## #   x700_nm <dbl>, eqsref <chr>, retail <chr>, amc <int>, date <date>

x400_nm-x700_nm are the wavelengths of light, hence they can be viewed as one variable but spread across multiple column. Gather from tidyr package mkes them into one new column under a single variable name wavelength. Transform values in wavelength from character to numbers.

fd = gather(fdata, key = wavelength, value = intensity, -eqsref, -retail, -amc, -date)
fd = fd %>% mutate(
  wavelength = substr(wavelength, start=2, stop=4),
  wavelength = as.numeric(wavelength)) 
fd
## # A tibble: 47,151 x 6
##    eqsref retail   amc       date wavelength intensity
##     <chr>  <chr> <int>     <date>      <dbl>     <dbl>
##  1   A0A5    pre     0 2015-09-12        400    0.8015
##  2   A0A5   post     0 2015-09-21        400    1.0129
##  3   A0Q3   post     3 2015-09-28        400    0.6970
##  4   A0Q3    pre     3 2015-09-19        400    1.6878
##  5   A0R5   post     3 2000-01-03        400    3.0329
##  6   A0R5    pre     2 2015-10-17        400    2.8820
##  7   A0S0   post     2 2015-09-28        400    0.5773
##  8   A0S0    pre     2 2015-09-19        400    0.6886
##  9   A0Y5   post     1 2015-09-21        400    0.3129
## 10   A0Y5    pre     1 2015-09-19        400    1.5288
## # ... with 47,141 more rows

Plot fd in lines and grouped by amc value.

ggplot(fd, aes(x = wavelength, y = intensity, group = interaction(eqsref,retail), colour = as.factor(date))) +
  geom_line(alpha = 0.5) +
  facet_wrap(~amc)

fdd = fd %>% group_by(eqsref,retail,date) %>% 
  mutate(intensity = intensity - median(intensity))

ggplot(fdd, aes(x = wavelength, y = intensity, 
                group = eqsref, colour = as.factor(date))) +
  geom_line(alpha = 0.5) +
  facet_grid(retail~amc)

Notice that there are two patterns for the curves. One is smoother after 600 and another is more like zigzag.

Replot with pre-retail data only

fd %>% filter(retail == "pre") %>% 
  ggplot(aes(x = wavelength, y = intensity, group = eqsref, colour = as.factor(date))) +
  geom_line(alpha = 0.5) +
  facet_wrap(~amc)

Replot with post-retail data only

fd %>% filter(retail == "post") %>% 
  ggplot(aes(x = wavelength, y = intensity, group = eqsref, colour = as.factor(date))) +
  geom_line(alpha = 0.5) +
  facet_wrap(~amc)

There is another interesting pattern in the graph. A group of curves shift upward vertically from the others as the datas are sampled from different dates.

fd %>% group_by(retail, date) %>% summarise
## # A tibble: 7 x 2
## # Groups:   retail [?]
##   retail       date
##    <chr>     <date>
## 1   post 2000-01-03
## 2   post 2015-09-14
## 3   post 2015-09-21
## 4   post 2015-09-28
## 5    pre 2015-09-12
## 6    pre 2015-09-19
## 7    pre 2015-10-17
fd %>% group_by(amc) %>% summarise(n = n())
## # A tibble: 6 x 2
##     amc     n
##   <int> <int>
## 1     0  1643
## 2     1  9486
## 3     2 12958
## 4     3 12307
## 5     4  9579
## 6     5  1178

Only one type of retail status is tested each day. amc = 0 and amc = 5 have significantly smaller sample space. Hence only amc 1:4 will be analysed.

To be able to identify the outliers better, data of each retail status, amc and date will be plotted using fboxplot from rainbow package.

In order to use iteration, separate the datas into pre-retail and post-retail.

fd_pre = fd %>% filter(retail == 'pre')
fd_post = fd %>% filter(retail == 'post')

Function to filter datas by amc and date

data_filter = function(dt, amc_val, date_val) {
  dt %>% filter(amc == amc_val, date == date_val)
  }

Function to crate a fds from the data In order to use fboxplot, data must be either fds or fts. Wavelength is on the x-axis. Intensity is on the y-axis. spread can convert the data into the accepted dimenson of the y argument in fds as each column is the intensity of different wavelength from one sample.

data_to_fds = function(dt){
  spread_data = function(dt){
    dt %>% dplyr::select(wavelength, intensity, eqsref) %>% spread(key = eqsref, value = intensity)
  } 
  fds(x = seq(400, 700, 10), y = as.matrix(spread_data(dt)[,-1]), xname = 'wavelength', yname = 'intensity')
}

Function to transform data into boxplot

data_to_plot = function(dt){
  data_to_fds(dt) %>% 
    fboxplot(plot.type = 'functional', type = 'bag', projmethod = 'PCAproj',
             xlab = 'wavelength', ylab = 'intensity', legendpos = 'topleft')
}

data_to_plot_biv = function(dt){
  data_to_fds(dt) %>% 
    fboxplot(plot.type = 'bivariate', type = 'bag', projmethod = 'PCAproj',
             xlab = 'wavelength', ylab = 'intensity', legendpos = 'topleft')
}

For loop to make boxplot graphs for every combination of amc 1:4 and dates that are pre-retailed.

for (i in c(1:5)){
  for (j in c('2015-09-12', '2015-09-19', '2015-10-17')){
    #par(mfrow = c(1,2),cex = 0.75)
    fd_pre %>% data_filter(i, j) %>% data_to_plot()
    title(paste('amc ', i, ',', 'date ', j))
    #fd_pre %>% data_filter(i, j) %>% data_to_plot_biv()
    #title(paste('amc ', i, ',', 'date ', j))
  }
}

Repeat for samples that are post-retailed.

for (i in c(1:4)){
  for (j in c('2000-01-03', '2015-09-14', '2015-09-21', '2015-09-28')){
    fd_post %>% data_filter(i, j) %>% data_to_plot()
    title(paste('amc ', i, ',', 'date ', j))
  }
}

The curves that are coloured represent outliers.

To combine data from different dates, instead of ploting intensity on the y axis, we are going to plot the first derivative of intenstiy, which is the difference in intensity between the two consecutive wavelengths. Since there is no data of wavelength at 710, the x value is changed from 400-700 to 400-690.

fdata_diff = fdata

for (i in c(1:30)){fdata_diff[i] = fdata[i + 1] - fdata[i]}

fdata_diff = fdata_diff %>% dplyr::select(eqsref, amc, date, retail, x400_nm:x690_nm)
fd_diff = fdata_diff %>% gather(key = wavelength, value = intensity, -eqsref, -retail, -amc, -date)
fd_diff = mutate(fd_diff, wavelength = as.numeric(substr(wavelength, start=2, stop=4)))
fd_diff
## # A tibble: 45,630 x 6
##    eqsref   amc       date retail wavelength intensity
##     <chr> <int>     <date>  <chr>      <dbl>     <dbl>
##  1   A0A5     0 2015-09-12    pre        400   -0.5026
##  2   A0A5     0 2015-09-21   post        400   -0.6319
##  3   A0Q3     3 2015-09-28   post        400   -0.1295
##  4   A0Q3     3 2015-09-19    pre        400   -0.4234
##  5   A0R5     3 2000-01-03   post        400   -0.4444
##  6   A0R5     2 2015-10-17    pre        400   -0.4220
##  7   A0S0     2 2015-09-28   post        400   -0.1575
##  8   A0S0     2 2015-09-19    pre        400   -0.3005
##  9   A0Y5     1 2015-09-21   post        400   -0.2293
## 10   A0Y5     1 2015-09-19    pre        400   -0.5664
## # ... with 45,620 more rows

Plot new fd into lines.

ggplot(fd_diff, aes(x = wavelength, y = intensity, group = eqsref, colour = as.factor(date))) +
  geom_line(alpha = 0.5) +
  facet_wrap(~amc)

fd_diff %>% filter(retail == "pre") %>% 
  ggplot(aes(x = wavelength, y = intensity, group = eqsref, colour = as.factor(date))) +
  geom_line(alpha = 0.5) +
  facet_wrap(~amc)

fd_diff %>% filter(retail == "post") %>% 
  ggplot(aes(x = wavelength, y = intensity, group = eqsref, colour = as.factor(date))) +
  geom_line(alpha = 0.5) +
  facet_wrap(~amc)

Repeat rainbow plot for the new functional data set. However, since the x value range is smaller, new functions that transform the data frame to fds and to boxplot need to be written.

bagplot gives error message when plotting pre amc = 3 and post amc = 2 and 3. Hence another function that plot hdrplot is also written. In bagplot the light grey region represents 99% of the data when in hdr plot it represents 93% of the data, which causes diffrerence in outliers that are identified by two different plots.

data_to_fds2 = function(dt){
  spread_data = function(dt){
    dt %>% dplyr::select(wavelength, intensity, eqsref) %>% spread(key = eqsref, value = intensity)
  } 
  fds(x = seq(400, 690, 10), y = as.matrix(spread_data(dt)[,-1]), xname = 'wavelength', yname = 'intensity')
}

#bagplot
data_to_plot2 = function(dt){
  data_to_fds2(dt) %>% 
    fboxplot(plot.type = 'functional', type = 'bag', projmethod = 'PCAproj', na.rm = TRUE,
             xlab = 'wavelength', ylab = 'intensity', legendpos = 'topleft')
}

#hdr plot
data_to_plot3 = function(dt){
  data_to_fds2(dt) %>% 
    fboxplot(plot.type = 'functional', type = 'hdr', projmethod = 'PCAproj',
             xlab = 'wavelength', ylab = 'intensity', legendpos = 'topleft')
}

fd_diff_pre = fd_diff %>% filter(retail == 'pre')
fd_diff_post = fd_diff %>% filter(retail == 'post')

In bagplot the light grey region represents 99% of the data when in hdr plot it represents 93% of the data, which causes diffrerence in outliers that are identified by two different plots.

fd_diff_pre %>% filter(amc == 1) %>% data_to_plot2

fd_diff_pre %>% filter(amc == 1) %>% data_to_plot3

Iterate to plot for each amc from pre-retail data.

par(mfrow = c(1, 1))
for (i in c(0:5)) {fd_diff_pre %>% filter(amc == i) %>% data_to_plot3
  title(paste('pre, ', 'amc ', i))}

Iterate to plot for each amc from post-retail data.

fd_diff_post = fd_diff %>% filter(retail == 'post')

for (i in c(0:5)) {fd_diff_post %>% filter(amc == i) %>% data_to_plot3
  title(paste('post, ', 'amc ', i))}

Comparing to pre-retail graphs, the pattern of post-retails at wavelength 650 is hard to be determined. Some curves behave more like pre-retail data. It is possible that it is related to the number of days between the two tests each sample took.

#each sample's pre/post retail date
fd_date = fd %>% group_by(eqsref, date, retail) %>% summarise %>%
  spread( key = retail, value = date) %>% dplyr::select(eqsref, pre, post) %>% arrange(pre)

fd_date = fd_date %>% mutate(days = post - pre)

fd_date
## # A tibble: 864 x 4
## # Groups:   eqsref [864]
##    eqsref        pre       post   days
##     <chr>     <date>     <date> <time>
##  1   A0A5 2015-09-12 2015-09-21 9 days
##  2   A1D9 2015-09-12 2015-09-14 2 days
##  3   A1S5 2015-09-12 2015-09-21 9 days
##  4   A3B8 2015-09-12 2015-09-21 9 days
##  5   A4S5 2015-09-12 2015-09-21 9 days
##  6   A4T8 2015-09-12 2015-09-14 2 days
##  7   A5Q2 2015-09-12 2015-09-21 9 days
##  8   A5W4 2015-09-12 2015-09-21 9 days
##  9   A6C8 2015-09-12 2015-09-21 9 days
## 10   A6X8 2015-09-12 2015-09-14 2 days
## # ... with 854 more rows

After cross comparing the outliers that are identified by intensity plots and difference plots, a vector of outlier and a new data set without these outliers are built.

#cross comparing outliers
outlier = c('C2M9', 'J4S1', 'M3T0', 'Y9G2', 'U2L1', 'H9P5', 'H5A5', 'K9M5', 
            'Q9D7', 'T4Y5', 'U2C4', 'X1A2', 'E9C9', 'M8E6', 'R7C9', 'E0F8', 
            'R2H2', 'U5H1', 'U6F0', 'J1T9', 'Y9V6', 'U8A0')

#data without outliers
fdata2 = fdata %>% filter(retail == 'pre', !(eqsref %in% outlier))
fd2 = fdata2 %>% gather(key = wavelength, value = intensity, -eqsref, -retail, -amc, -date)
fd2 = fd2 %>% mutate(wavelength = substr(wavelength, start = 2, stop = 4),
                     wavelength = as.numeric(wavelength))
ggplot(fd2, aes(x = wavelength, y = intensity, group = eqsref, colour = as.factor(date))) +
  geom_line(alpha = 0.5) +
  facet_wrap(~amc)

Bagplot of intensity by different amc

for (i in c(0:5)){
  for (j in c('2015-09-12', '2015-09-19', '2015-10-17')){
    #par(mfrow = c(1,2),cex = 0.75)
    fd2 %>% filter(date == j) %>% data_to_plot()
    title(paste('amc ', i, ',', 'date ', j))
    #fd_pre %>% data_filter(i, j) %>% data_to_plot_biv()
    #title(paste('amc ', i, ',', 'date ', j))
  }
}

new intensity difference data without the outliers

fdata2_diff = fdata2

for (i in c(1:30)){fdata2_diff[i] = fdata2[i] - fdata2[i + 1]}

fdata2_diff = fdata2_diff %>% dplyr::select(eqsref, amc, date, retail, x400_nm:x690_nm)
fd2_diff = fdata2_diff %>% gather(key = wavelength, value = intensity, -eqsref, -retail, -amc, -date)
fd2_diff = mutate(fd2_diff, wavelength = as.numeric(substr(wavelength, start=2, stop=4)))

for (i in c(0:5)) {fd2_diff %>% filter(amc == i) %>% data_to_plot3
  title(paste('pre, ', 'amc ', i))}